In [1]:
# %load database-reader.py
import sqlite3
import pandas as pd
from xml.etree import ElementTree
Origin: 3,040 raw match observations for the English Premier League. After merging with the team attributes, this reduces to 2,393 observations.
Missing data: (1) from the Match table we discard the PS, SJ, GB, and BS betting-company odds and the player features, and keep only the Premier League; (2) we then discard an additional four rows with empty features.
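These counts can be sanity-checked directly against the database. A minimal sketch, assuming the same soccer/database.sqlite file and the EPL league_id of 1729 used in the cells below:
import sqlite3
import pandas as pd
conn_check = sqlite3.connect("soccer/database.sqlite")
n_raw = pd.read_sql("SELECT COUNT(*) AS n FROM Match WHERE league_id = 1729", con=conn_check)['n'][0]
print('raw EPL matches:', n_raw)   # should agree with the 3,040 quoted above
conn_check.close()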
In [2]:
db_location = "soccer/database.sqlite"
conn = sqlite3.connect(db_location)
In [3]:
df_match = pd.read_sql(sql="SELECT id,league_id, season, stage, date, match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, goal, shoton, shotoff, foulcommit, card, cross, corner, possession, B365H, B365D, B365A, BWH, BWD, BWA, IWH, IWD, IWA, LBH, LBD, LBA, WHH,WHD,WHA,VCH,VCD,VCA FROM Match WHERE league_id=1729 AND date >= '2010-02-22'", con=conn)
df_match=df_match.dropna()
df_match.head()
Out[3]:
In [4]:
frames = list()
for index, row in df_match.iterrows():
    hteam = row.home_team_api_id
    ateam = row.away_team_api_id
    date = row.date  ## match date
    hquery = "SELECT * FROM team_attributes WHERE team_api_id=" + str(hteam)
    aquery = "SELECT * FROM team_attributes WHERE team_api_id=" + str(ateam)
    df_hteam = pd.read_sql(sql=hquery, con=conn)
    df_ateam = pd.read_sql(sql=aquery, con=conn)
    ## keep only attribute snapshots taken on or before the match date,
    ## then take the last remaining row as the latest pre-match snapshot
    df_hteam = df_hteam[df_hteam.date <= date]
    df_ateam = df_ateam[df_ateam.date <= date]
    df_hlatest = df_hteam.tail(1)
    df_alatest = df_ateam.tail(1)
    df_row = row.to_frame().T
    ##print(df_row)
    df_row_home = pd.merge(df_row, df_hlatest, how='outer', left_on='home_team_api_id',
                           right_on='team_api_id', suffixes=('_match', '_home'))
    df_row_home_away = pd.merge(df_row_home, df_alatest, how='outer', left_on='away_team_api_id',
                                right_on='team_api_id', suffixes=('_home', '_away'))
    frames.append(df_row_home_away)
    ##hteam_frames.append(df_hlatest)
    ##ateam_frames.append(df_alatest)
df_match_team = pd.concat(frames).reset_index(drop=True)
#df_ateams = pd.concat(ateam_frames)
df_match_team.head()
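The row-by-row loop above can also be expressed with pandas.merge_asof, which attaches, for each match, the most recent attribute row dated on or before the match. A hedged alternative sketch, not the approach used in this notebook, assuming df_match and the connection from the cells above:
df_attr = pd.read_sql("SELECT * FROM team_attributes", con=conn)
df_attr['date'] = pd.to_datetime(df_attr['date'])
df_m = df_match.copy()
df_m['date'] = pd.to_datetime(df_m['date'])
# both frames must be sorted on the asof key
df_m = df_m.sort_values('date')
df_attr = df_attr.sort_values('date')
# latest home-team attributes recorded on or before each match date
with_home = pd.merge_asof(df_m, df_attr, on='date',
                          left_by='home_team_api_id', right_by='team_api_id',
                          direction='backward', suffixes=('_match', '_home'))
# repeating the call with left_by='away_team_api_id' attaches the away-team attributes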
Out[4]:
In [5]:
# Replace possession XML with the latest value
for index, row in df_match_team.iterrows():
    tree = ElementTree.fromstring(row['possession'])
    elapsed = -1
    possession = -1
    for value in tree:
        # guard against entries that lack an elapsed/homepos tag
        curelapsed = -1
        curpossession = possession
        for tag in value:
            if tag.tag == 'elapsed':
                curelapsed = int(tag.text)
            if tag.tag == 'homepos':
                curpossession = tag.text
        # keep the home-possession value with the largest elapsed time
        if elapsed <= curelapsed:
            elapsed = curelapsed
            possession = curpossession
    df_match_team.loc[index, 'possession'] = possession
df_match_team = df_match_team.rename(columns={'possession': 'possession_home'})
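For reference, each possession entry carries (at least) an elapsed and a homepos tag, and the loop above keeps the homepos value with the largest elapsed time. A tiny synthetic example; the exact XML layout is inferred from the tags read above, not from the database documentation:
sample = '<possession><value><elapsed>45</elapsed><homepos>55</homepos></value><value><elapsed>90</elapsed><homepos>60</homepos></value></possession>'
root = ElementTree.fromstring(sample)
latest = max(root, key=lambda v: int(v.find('elapsed').text))
print(latest.find('homepos').text)   # '60' -- the full-time home possession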
In [6]:
def parse_xml(field):
    global df_match_team
    # Replace the XML in `field` with per-team event counts
    if field == 'card':
        df_match_team['ycard_home'] = 0
        df_match_team['ycard_away'] = 0
        df_match_team['rcard_home'] = 0
        df_match_team['rcard_away'] = 0
    else:
        df_match_team['%s_home' % field] = 0
        df_match_team['%s_away' % field] = 0
        if field == 'cross':
            df_match_team['throwin_home'] = 0
            df_match_team['throwin_away'] = 0
    for index, row in df_match_team.iterrows():
        tree = ElementTree.fromstring(row[field])
        for value in tree:
            ignore = False
            away = ''
            cardtype = ''
            field_save = field
            for tag in value:
                if tag.tag == 'team':
                    current_team = int(tag.text)
                    if current_team == row['home_team_api_id']:
                        away = 'home'
                    elif current_team == row['away_team_api_id']:
                        away = 'away'
                    else:
                        print(current_team)
                        print(row)
                        assert False
                if (tag.tag == 'card_type' or tag.tag == 'comment') and field == 'card':
                    cardtype = tag.text
                    if cardtype == 'y2':
                        # a second yellow is counted as a red card
                        cardtype = 'r'
                if tag.tag == 'type':
                    if tag.text != field:
                        if field == 'cross' and tag.text == 'throwin':
                            # throw-ins are logged under 'cross' but counted separately
                            field_save = tag.text
                        else:
                            if tag.text != 'corner':
                                print('%s instead of %s' % (tag.text, field))
                            ignore = True
            if not ignore and (away == 'away' or away == 'home'):
                df_match_team.loc[index, '%s%s_%s' % (cardtype, field_save, away)] += 1
    df_match_team = df_match_team.drop(field, axis=1)
parse_xml('corner')
parse_xml('cross')
parse_xml('shoton')
parse_xml('shotoff')
parse_xml('foulcommit')
parse_xml('card')
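For fields other than card, the parser simply counts events per team (with throw-ins split out of cross). A standalone illustration of that counting for one synthetic corner fragment, using made-up team ids and an XML shape inferred from the tags read above:
sample_corner = ('<corner>'
                 '<value><type>corner</type><team>1111</team></value>'
                 '<value><type>corner</type><team>2222</team></value>'
                 '<value><type>corner</type><team>1111</team></value>'
                 '</corner>')
home_id, away_id = 1111, 2222   # hypothetical api ids
counts = {'corner_home': 0, 'corner_away': 0}
for value in ElementTree.fromstring(sample_corner):
    team = int(value.find('team').text)
    counts['corner_home' if team == home_id else 'corner_away'] += 1
print(counts)   # {'corner_home': 2, 'corner_away': 1}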
In [7]:
# Drop IDs and dates
df_match_team = df_match_team.drop(labels=['date', 'id', 'id_match', 'league_id', 'season', 'date_match', 'match_api_id',
                                           'home_team_api_id', 'away_team_api_id', 'team_fifa_api_id_home',
                                           'team_api_id_home', 'team_fifa_api_id_away', 'team_api_id_away',
                                           'id_home', 'date_home', 'goal'], axis=1)
# Use the categorical "Class" columns to fill missing numerical values
replacements = []
for index, row in df_match_team.iterrows():
    for column in df_match_team:
        if pd.isnull(row[column]):
            # a numeric column X_home/X_away has a categorical XClass_home/XClass_away counterpart
            classColumn = column[:-5] + 'Class' + column[-5:]
            if classColumn in df_match_team:
                replacement = df_match_team[column][df_match_team[classColumn] == row[classColumn]].median()
                replacements.append('In %s: replaced %s with %f' % (column, row[classColumn], replacement))
                df_match_team.loc[index, column] = replacement
for r in set(replacements):
    print(r)
# Drop columns that still have missing values
df_match_team = df_match_team.dropna(axis=1)
# Drop Class columns that have numerical equivalents
for column in df_match_team:
    classColumn = column[:-5] + 'Class' + column[-5:]
    if classColumn in df_match_team:
        df_match_team = df_match_team.drop(classColumn, axis=1)
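A toy illustration of the Class-based imputation above; the column names here are made up, but they follow the same numeric/Class pairing the slicing column[:-5] + 'Class' + column[-5:] relies on:
toy = pd.DataFrame({'speed_home': [30.0, 35.0, None, 70.0, 75.0],
                    'speedClass_home': ['Slow', 'Slow', 'Slow', 'Fast', 'Fast']})
for index, row in toy.iterrows():
    if pd.isnull(row['speed_home']):
        same_class = toy['speed_home'][toy['speedClass_home'] == row['speedClass_home']]
        toy.loc[index, 'speed_home'] = same_class.median()
print(toy)   # the missing 'Slow' value is filled with 32.5, the median of 30 and 35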
In [8]:
for columnName in df_match_team:
    no_unique = len(df_match_team[columnName].unique())
    if no_unique == 1:
        print('Dropping %s because it has only 1 unique value' % columnName)
        df_match_team = df_match_team.drop(columnName, axis=1)
    if no_unique == 2:
        # binarize: True where the value equals the column's mode
        mode = df_match_team[columnName].mode().values[0]
        df_match_team.loc[:, columnName] = df_match_team[columnName] == mode
        df_match_team = df_match_team.rename(columns={columnName: columnName + '_is' + mode})
        print('Binarizing %s to %s_is%s' % (columnName, columnName, mode))
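A toy example of the binarization step, with a made-up column and values:
toy = pd.DataFrame({'styleClass_home': ['Fast', 'Slow', 'Fast', 'Fast']})
mode = toy['styleClass_home'].mode().values[0]                  # 'Fast'
toy.loc[:, 'styleClass_home'] = toy['styleClass_home'] == mode
toy = toy.rename(columns={'styleClass_home': 'styleClass_home_is' + mode})
print(toy)   # one boolean column, styleClass_home_isFast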
In [9]:
df_match_team.to_csv('dataset.csv', index=None)
df_match_team.dtypes
Out[9]:
In [10]:
conn.close()